A Table-Based Q-Learning Reinforcement Agent in a Grid World

This is a simple example of a tabular Q-learning agent. The Q function is stored as a table, and each decision is made thermally: an action is sampled from a Boltzmann (softmax) distribution over the Q-values of the current state.
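
As a quick illustration of this thermal (Boltzmann) action selection, the sketch below picks an action with probability proportional to exp(Q[s, a] / T), where T is the temperature. The helper name boltzmann_action and its temperature argument are illustrative only; the training loop further down implements the same idea inline.

import numpy as np

def boltzmann_action(q_values, temperature=0.01):
    # Exponentiate the Q-values scaled by the temperature; a lower
    # temperature pushes the choice toward the greedy argmax.
    prefs = np.exp(np.asarray(q_values) / temperature)
    probs = prefs / prefs.sum()
    # sample an action index according to the softmax probabilities
    return np.random.choice(len(probs), p=probs)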


In [1]:
import numpy as np
import random
import gym

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

from IPython.display import clear_output
from tqdm import tqdm

In [2]:
env = gym.make('FrozenLake-v0')


[2017-03-24 09:58:26,166] Making new env: FrozenLake-v0

In [40]:
Q = np.zeros([env.observation_space.n, env.action_space.n])

In [60]:
# Set learning parameters
decision_temperature = 0.01  # softmax (Boltzmann) temperature
l_rate = 0.5                 # learning rate
y = .99                      # discount factor
e = 0.1                      # exploration rate; not used by the softmax action selection below
num_episodes = 900

# create lists to contain total rewards and steps per episode

epi_length = []
rs = []

for i in tqdm(range(num_episodes)):
    s = env.reset()
    r_total = 0
    done = False
    number_jumps = 0

    # limit the number of jumps per episode
    while number_jumps < 99:
        number_jumps += 1

        # Boltzmann (softmax) preferences over the actions in the current state
        softmax = np.exp(Q[s] / decision_temperature)
        rand_n = np.random.rand() * np.sum(softmax)

        # sample the next action in proportion to its softmax preference
        acc = 0
        for ind in range(env.action_space.n):
            acc += softmax[ind]
            if acc >= rand_n:
                a = ind
                break
                
        #print(a, softmax, rand_n)
        
        # a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1./(i+1)))

        s_next, r, done, _ = env.step(a)

        # largest Q-value attainable from the next state
        max_Q_next = np.max(Q[s_next, :])

        # Q-learning update: move Q[s, a] toward the target r + y * max_Q_next
        Q[s, a] += l_rate * (r + y * max_Q_next - Q[s, a])

        r_total += r
        s = s_next
        if done:
            # anneal the exploration rate as learning progresses
            # (e is not used by the softmax action selection above)
            e = 1./((i/50) + 10)
            break

    # report progress every 900 episodes (with num_episodes = 900, only once at the end)
    if i % 900 == 899:

        clear_output(wait=True)
        # the sum of the last 200 (0/1) rewards divided by 2 is a success percentage
        print("success rate: " + str(sum(rs[-200:])/2) + "%")

        plt.figure(figsize=(8, 8))
        plt.subplot(211)
        plt.title("Jumps Per Episode", fontsize=18)
        plt.plot(epi_length[-200:], "#23aaff")
        plt.subplot(212)
        plt.title('Reward For Each Episode (0/1)', fontsize=18)
        plt.plot(rs[-200:], "o", color='#23aaff', alpha=0.1)
        
        plt.figure(figsize=(6, 6))
        plt.title('Decision Table', fontsize=18)
        plt.xlabel("States", fontsize=15)
        plt.ylabel('Actions', fontsize=15)
        plt.imshow(Q.T)
        plt.show()

    epi_length.append(number_jumps)
    rs.append(r_total)


success rate: 71.5%
100%|██████████| 900/900 [00:01<00:00, 546.26it/s]

In [52]:
def mv_avg(xs, n):
    # simple moving average with window size n
    return [sum(xs[i:i+n])/n for i in range(len(xs)-n)]
# plt.plot(mv_avg(rs, 200))

In [55]:
plt.figure(figsize=(8, 8))
plt.subplot(211)
plt.title("Jumps Per Episode", fontsize=18)
plt.plot(epi_length, "#23aaff", linewidth=0.1, alpha=0.7,
        label="raw data")
plt.plot(mv_avg(epi_length, 200), color="blue", alpha=0.3, linewidth=4, 
         label="Moving Average")
plt.legend(loc=(1.05, 0), frameon=False, fontsize=15)
plt.subplot(212)
plt.title('Reward For Each Episode (0/1)', fontsize=18)
#plt.plot(rs, "o", color='#23aaff', alpha=0.2, markersize=0.4, label="Reward")
plt.plot(mv_avg(rs, 200), color="red", alpha=0.5, linewidth=4, label="Moving Average")
plt.ylim(-0.1, 1.1)
plt.legend(loc=(1.05, 0), frameon=False, fontsize=15)
plt.savefig('./figures/Frozen-Lake-v0-thermal-table.png', dpi=300, bbox_inches='tight')


